In [ ]:
import pandas as pd
import numpy as np
import plotly.express as px
from statsmodels.stats.outliers_influence import variance_inflation_factor
import plotly.io as pio
pio.renderers.default='notebook'

Previous knowledge:

  • Price has a strong correlation with num_bath, size_house, year_built, avg_size_neighbor_houses
  • num_bed, num_bath, num_floors, is_waterfront, condition are categorical variables
  • The df doesn't have null values.
  • The number of zeros is low in the df.
  • Outliers can't be discarded without business knowledge (most of the outliers look like real information).
  • Inliers can't be detected as the data looks reasonable (the data don't break any logical or physical rule).
  • Collinearity hasn't been checked yet
  • The data don't have a time dimension (temporal analysis can't be performed)
In [ ]:
df = pd.read_csv(".././data/house_sales.csv")
In [ ]:
# Bucket prices into 10 equal-frequency groups: 1 = cheapest decile,
# 10 = most expensive. pd.qcut computes every decile edge in one pass and
# avoids two problems of the original manual loop: floating-point drift from
# repeatedly subtracting 0.1 from the quantile level, and overlapping
# >=/<= boundaries that silently reassigned boundary rows to the higher group.
df["price_quantile"] = pd.qcut(df["price"], q=10, labels=False) + 1

Relationship between main correlated variables and price¶

In [ ]:
# Scatter price against each of its most strongly correlated numeric features
# to inspect how linear the relationship is.
# NOTE: `col_names` is reused by the box-plot cell below — keep the name.
col_names = ["num_bath", "size_house", "year_built", "avg_size_neighbor_houses"]
for feature in col_names:
    px.scatter(df, x=feature, y="price").show()
In [ ]:
# Box plot of each correlated feature per price decile; sorting by the decile
# label keeps the legend/category order 1..10. Sort into a separately named
# frame instead of rebinding the shared `df` (avoids hidden-state mutation
# that downstream cells would silently inherit).
df_by_quantile = df.sort_values("price_quantile")
for i_col in col_names:
    fig = px.box(df_by_quantile, y=i_col, color="price_quantile")
    fig.show()

Relationship between categorical variables and price¶

In [ ]:
# Price distribution per level of each categorical feature. Sorting by the
# category keeps the levels in order on the legend. Use a sorted copy inside
# the loop rather than rebinding the shared `df` on every iteration, so the
# global frame is left untouched for later cells.
col_names = ['num_bed', 'num_bath', 'num_floors', 'is_waterfront', 'condition']
for i_col in col_names:
    df_sorted = df.sort_values(i_col)
    fig = px.box(df_sorted, y="price", color=i_col)
    fig.show()

Colinearity analysis¶

In [ ]:
# Variance inflation factor (VIF) for every predictor on the raw data.
# The target (`price`) and the derived `price_quantile` column are excluded.
# NOTE: `df_temp` is reused by the normalized-VIF cell below — keep the name.
df_temp = df.drop(columns=["price", "price_quantile"])
# One VIF per column: how strongly that column is explained by the others.
vif_data = pd.DataFrame({
    "feature": df_temp.columns,
    "VIF": [
        variance_inflation_factor(df_temp.values, col_idx)
        for col_idx in range(df_temp.shape[1])
    ],
})
print(vif_data)
                     feature           VIF
0                    num_bed  2.245863e+01
1                   num_bath  2.839346e+01
2                 size_house  3.139842e+01
3                   size_lot  2.260831e+00
4                 num_floors  1.661399e+01
5              is_waterfront  1.044470e+00
6                  condition  3.458419e+01
7              size_basement  2.677797e+00
8                 year_built  9.046268e+03
9            renovation_date  1.194766e+00
10                       zip  1.593359e+06
11                  latitude  1.345629e+05
12                 longitude  1.350472e+06
13  avg_size_neighbor_houses  2.417710e+01
14     avg_size_neighbor_lot  2.505680e+00
In [ ]:
# Repeat the VIF computation on z-scored features: standardizing removes the
# scale-driven inflation seen above, so the remaining VIF reflects actual
# collinearity between predictors.
normalized_df = (df_temp - df_temp.mean()) / df_temp.std()
vif_data = pd.DataFrame({
    "feature": normalized_df.columns,
    "VIF": [
        variance_inflation_factor(normalized_df.values, col_idx)
        for col_idx in range(normalized_df.shape[1])
    ],
})
print(vif_data)
                     feature       VIF
0                    num_bed  1.609913
1                   num_bath  3.332022
2                 size_house  5.136044
3                   size_lot  2.004992
4                 num_floors  1.991210
5              is_waterfront  1.037671
6                  condition  1.250221
7              size_basement  1.906437
8                 year_built  2.289709
9            renovation_date  1.148484
10                       zip  1.645631
11                  latitude  1.143178
12                 longitude  1.785075
13  avg_size_neighbor_houses  2.629664
14     avg_size_neighbor_lot  2.044504

Location analysis¶

In [ ]:
import folium  # needed for interactive map
from folium.plugins import HeatMap

# Heat map of sale prices over the Seattle area — brighter regions mean
# higher-priced sales. Each point is (lat, lon, weight=price).
price_map = folium.Map(location=[47.56, -122.21],
                       zoom_start=13,
                       tiles="OpenStreetMap")

price_points = list(zip(df['latitude'], df['longitude'], df['price']))
heat_layer = HeatMap(price_points,
                     min_opacity=0.2,
                     radius=8, blur=6,
                     max_zoom=15,
                     )

price_map.add_child(heat_layer)
price_map
Out[ ]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Conclusions¶

  • "num_bath", "size_house", "avg_size_neighbor_houses" exhibit a strong positive linear correlation with price in the scatter plots. The box plots show that, for these features, the distribution shifts upward as the price quantile increases.

  • When the number of bedrooms increases from 0 to 4, the price distribution shifts up. Beyond 4 bedrooms the distribution looks similar for all remaining cases.

  • The price increases as the number of bathrooms increases.

  • The price distribution is highest when the number of floors is 2 or 2.5.

  • Price distribution is higher if the house is waterfront.

  • Price distribution is lower if the condition is 1 or 2. When the condition is 3, 4 or 5 the price distribution is similar.

  • The dataset presents high collinearity; once the data is standardized, the measured collinearity is mitigated.

  • From the heat map we can identify the Seattle regions where prices are higher, which means that location is a major feature.